library(here)
## here() starts at /Users/hyunhwan/Projects/InProgress/CB2-Experiments/01_gene-level-analysis
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.3.0
## ✔ tibble 2.0.1 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(CB2)
library(CRISPhieRmix)
library(pheatmap)
Load a data file. This is intially from https://doi.org/10.1016/j.celrep.2018.06.027.
df_read <- read_tsv(here("../02_sgRNA-level-analysis/dat/Gsk3_readcount.txt"))
## Parsed with column specification:
## cols(
## gRNA = col_character(),
## Gene = col_character(),
## Before_1 = col_double(),
## Before_2 = col_double(),
## Before_3 = col_double(),
## Before_4 = col_double(),
## Pos_1 = col_double(),
## Pos_2 = col_double(),
## Pos_3 = col_double(),
## Pos_4 = col_double()
## )
df_count <- df_read %>% unite("id", c("Gene", "gRNA")) %>% column_to_rownames("id")
df_design <- tibble(sample_name = colnames(df_count)) %>%
separate("sample_name", c("group", "rep"), sep = "_", remove = F)
df_design
Run CB2
sgstat <- run_estimation(df_count, df_design, "Before", "Pos")
sgstat
genestat <- measure_gene_stats(sgstat)
genestat
summary(sgstat$p_ts)
## V1
## Min. :0.0000
## 1st Qu.:0.2968
## Median :0.5393
## Mean :0.5311
## 3rd Qu.:0.7752
## Max. :1.0000
Running CRISPhieRmix
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colMeans, colnames, colSums, dirname, do.call, duplicated,
## eval, evalq, Filter, Find, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
## Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which, which.max,
## which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
##
## count
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:purrr':
##
## simplify
## The following objects are masked from 'package:base':
##
## aperm, apply
dds <- DESeqDataSetFromMatrix(df_count, df_design, ~group)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
df_ret <- results(dds) %>% as.data.frame
head(df_ret)
summary(df_ret$pvalue)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.3165 0.5734 0.5470 0.7938 1.0000 1838
df_ret$log2FoldChange[is.na(df_ret$log2FoldChange)] <- 0
df_cmix <- df_ret %>% rownames_to_column("id") %>%
separate("id", c("gene", "gRNA"), sep = "_", extra="merge") %>% as.data.frame
df_cmix$gene <- factor(df_cmix$gene, levels = unique(df_cmix$gene))
cmix <- CRISPhieRmix(df_cmix$log2FoldChange, df_cmix$gene)
## Loading required package: mixtools
## mixtools package, version 1.1.0, Released 2017-03-10
## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772.
## number of iterations= 36
df_cmixret <- tibble(gene = cmix$genes, locfdr = cmix$locfdr, fdr = cmix$FDR)
df_merge <- genestat %>% left_join(df_cmixret, by="gene")
## Warning: Column `gene` joining character vector and factor, coercing into
## character vector
df_merge %>% ggplot(aes(x=-log10(fdr_ts), y=-log10(fdr))) +
geom_point() + xlab("CB2") + ylab("CRISPhieRmix")
As we have seen below, CB2 detect 205 genes as hits, and CRIPSPhieRmix detect 3,426 genes.
print(sum(genestat$fdr_ts<0.1))
## [1] 205
print(sum(df_cmixret$fdr<0.1))
## [1] 3426
Only five genes were uniquely detected by CB2.
df_merge %>% filter(fdr_ts < 0.1, fdr < 0.1) %>% nrow
## [1] 200
Visualize heatmap (scaled by each row)
genes <- df_merge %>% filter(fdr < 0.1,fdr_ts > 0.1) %>% pull(gene)
print(genes %>% head)
## [1] "0610009D07Rik" "0610010F05Rik" "1110004E09Rik" "1110008J03Rik"
## [5] "1110032A03Rik" "1110059E24Rik"
print(length(genes))
## [1] 3226
df_norm <- df_count
for(j in 1:ncol(df_norm)) {
df_norm[,j] <- log2(df_norm[,j] / sum(df_norm[,j]) * 10^6+1)
}
df_norm %>% rownames_to_column("id") %>% separate(id, c("gene", "gRNA"), extra = "merge") %>%
filter(gene %in% genes) %>%
select(-gene) %>%
column_to_rownames("gRNA") -> df_heatmap
df_heatmap[rowSums(df_heatmap) > 0,] %>%
pheatmap(show_rownames = F, scale = "row", main = "sgRNA heatmap of hit genes uniquely detected by CRISPhieRmix")
genes <- df_merge %>% filter(fdr < 0.1,fdr_ts < 0.1) %>% pull(gene)
print(genes %>% head)
## [1] "1110008L16Rik" "1110037F02Rik" "1810043H04Rik" "Aars2"
## [5] "Aco2" "Adck4"
print(length(genes))
## [1] 200
df_norm <- df_count
for(j in 1:ncol(df_norm)) {
df_norm[,j] <- log2(df_norm[,j] / sum(df_norm[,j]) * 10^6+1)
}
df_norm %>% rownames_to_column("id") %>% separate(id, c("gene", "gRNA"), extra = "merge") %>%
filter(gene %in% genes) %>%
select(-gene) %>%
column_to_rownames("gRNA") -> df_heatmap
df_heatmap[rowSums(df_heatmap) > 0,] %>%
pheatmap(show_rownames = F, scale = "row", main = "sgRNA heatmap of hit genes detected by both.")
Visualize heatmap (no-scaling)
genes <- df_merge %>% filter(fdr < 0.1,fdr_ts > 0.1) %>% pull(gene)
print(genes %>% head)
## [1] "0610009D07Rik" "0610010F05Rik" "1110004E09Rik" "1110008J03Rik"
## [5] "1110032A03Rik" "1110059E24Rik"
print(length(genes))
## [1] 3226
df_norm <- df_count
for(j in 1:ncol(df_norm)) {
df_norm[,j] <- log2(df_norm[,j] / sum(df_norm[,j]) * 10^6+1)
}
df_norm %>% rownames_to_column("id") %>% separate(id, c("gene", "gRNA"), extra = "merge") %>%
filter(gene %in% genes) %>%
select(-gene) %>%
column_to_rownames("gRNA") -> df_heatmap
df_heatmap[rowSums(df_heatmap) > 0,] %>%
pheatmap(show_rownames = F, scale = "none", main = "sgRNA heatmap of hit genes uniquely detected by CRISPhieRmix")
genes <- df_merge %>% filter(fdr < 0.1,fdr_ts < 0.1) %>% pull(gene)
print(genes %>% head)
## [1] "1110008L16Rik" "1110037F02Rik" "1810043H04Rik" "Aars2"
## [5] "Aco2" "Adck4"
print(length(genes))
## [1] 200
df_norm <- df_count
for(j in 1:ncol(df_norm)) {
df_norm[,j] <- log2(df_norm[,j] / sum(df_norm[,j]) * 10^6+1)
}
df_norm %>% rownames_to_column("id") %>% separate(id, c("gene", "gRNA"), extra = "merge") %>%
filter(gene %in% genes) %>%
select(-gene) %>%
column_to_rownames("gRNA") -> df_heatmap
df_heatmap[rowSums(df_heatmap) > 0,] %>%
pheatmap(show_rownames = F, scale = "none", main = "sgRNA heatmap of hit genes detected by both.")